USMLE Data Analysis

Author

Lisa Levoir

Published

October 26, 2023

0.0.1 Setup

Criteria to exclude students who most likely took a scored exam:

  1. Any PhD students (n = 2)

  2. Any 5th year program students (n = 19)

  3. M4 students at Vanderbilt (n = 5)

  4. Students who did not complete either Step survey (n = 2)

  5. Students who specifically stated they took a scored Step 1 (n=1)

Based on our criteria we would exclude record IDs: 

  • VUSM: 23, 26, 39, 40, 54, 3, 8, 12, 49, 60, 62, 64 

  • HMS: 1, 21, 28, 30, 34, 37, 39, 41, 44, 47, 49, 61 

  • UVA: 33, 47, 80, 81, 83 

#the list of IDs we decided as a group to exclude
excludeVU <- c(23, 26, 39, 40, 54, 3, 8, 12, 49, 60, 62, 64)
excludeHMS <- c(1, 21, 28, 30, 34, 37, 39, 41, 44, 47, 49, 61)
excludeUVA <- c(33, 47, 80, 81, 83)

'%!in%' <- function(x,y)!('%in%'(x,y)) #make a way to use the not in command
VU_in <- filter(VUMSdat, record_id  %!in%  excludeVU)
H_in <- filter(HMSdat, record_id  %!in%  excludeHMS)
UVA_in <- filter(UVAdat, record_id  %!in%  excludeUVA)

#now I want to select the columns I'd like to include for all of my analysis (so they're in the proper order for a cbind). this will be relatively easy to come back to edit later, if needed. 

#first, remember to include a school identifier
VU_in[,"schoolid"] <- "VU" 
UVA_in[,"schoolid"] <- "UVa"
H_in[,"schoolid"] <- "HMS"

############ plan for how I will get the data in a format I want:
# - pull relevant columns by "starts with"
# - confirm all column names match, then
# - rbind together once
# - then I can select from this sheet the questions relevant to Step 1 first with ends with "_1", and those who took Step 1 second with a "_2"

############ Now pulling the common columns we're interested in as predictors and outcomes
##note, for VU I removed ""number_other_courses_step1_1" and starts_with("other_courses_step1_1___1) because these questions were not on the other school surveys
took_step1_VU <-
  VU_in %>% select(
    starts_with("uworld_percent_step1"),
    starts_with("amboss_percent_step1"),
    starts_with("length_step1"),
    starts_with("practicetest_step1"),
    starts_with("full_test_practice_step1"),
    starts_with("push_step1"),
    starts_with("push_practice_test_step1"),
    starts_with("push_nbme_practice_score_step1"),
    starts_with("push_uw_practice_score_step1"),
    starts_with("final_nbme_practice_score_step1"),
    starts_with("final_uw_practice_score_step1"),
    starts_with("score_step1"),
    "schoolid"
  ) 

took_step1_UVA <-
  UVA_in %>% select(
    starts_with("uworld_percent_step1"),
    starts_with("amboss_percent_step1"),
    starts_with("length_step1"),
    starts_with("practicetest_step1"),
    starts_with("full_test_practice_step1"),
    starts_with("push_step1"),
    starts_with("push_practice_test_step1"),
    starts_with("push_nbme_practice_score_step1"),
    starts_with("push_uw_practice_score_step1"),
    starts_with("final_nbme_practice_score_step1"),
    starts_with("final_uw_practice_score_step1"),
    starts_with("score_step1"),
    "schoolid"
  )

took_step1_H <-
  H_in %>% select(
    starts_with("uworld_percent_step1"),
    starts_with("amboss_percent_step1"),
    starts_with("length_step1"),
    starts_with("practicetest_step1"),
    starts_with("full_test_practice_step1"),
    starts_with("push_step1"),
    starts_with("push_practice_test_step1"),
    starts_with("push_nbme_practice_score_step1"),
    starts_with("push_uw_practice_score_step1"),
    starts_with("final_nbme_practice_score_step1"),
    starts_with("final_uw_practice_score_step1"),
    starts_with("score_step1"),
    "schoolid"
  )

## now rbinding the three schools together
took_step1general <- rbind(took_step1_H, took_step1_UVA, took_step1_VU)

## splitting the dataset so I also have reference sheets specific to step 1 first and step 1 second
took_step1_first <- took_step1general %>% select(ends_with("_1"), "schoolid")
took_step1_second <- took_step1general %>% select(ends_with("_2"), "schoolid")

# profiling missing data (must discuss!!)
visdat::vis_miss(took_step1general) 

visdat::vis_miss(took_step1_first) #looks like a school specific trend with who is answering what question...

visdat::vis_miss(took_step1_second) #looks like a school specific trend with who is answering what question...

#these are the variables I can actually analyze
describe(VU_in$score_step2_2) 
VU_in$score_step2_2 
       n  missing distinct 
      28       25       19 
                                                                            
Value        239   243   244   252   254   255   256   258   260   261   263
Frequency      1     1     1     1     2     2     1     2     2     2     1
Proportion 0.036 0.036 0.036 0.036 0.071 0.071 0.036 0.071 0.071 0.071 0.036
                                                          
Value        264   265   267   268   270   271   273   275
Frequency      3     1     3     1     1     1     1     1
Proportion 0.107 0.036 0.107 0.036 0.036 0.036 0.036 0.036
describe(VU_in$score_step2_1)
VU_in$score_step2_1 
       n  missing distinct 
       7       46        6 
                                              
Value        255   264   266   271   272   Nah
Frequency      1     1     1     1     2     1
Proportion 0.143 0.143 0.143 0.143 0.286 0.143
took_step2_VU <-
  VU_in %>% select(
    starts_with("uworld_percent_step2"),
    starts_with("amboss_percent_step2"),
    starts_with("length_step2"),
    starts_with("practicetest_step2"),
    starts_with("full_test_practice_step2"),
    starts_with("practice_score_step2"),
    starts_with("score_step2"),
    starts_with("target_score_step2"),
    "schoolid"
  ) #note, I removed ""number_other_courses_step1_1" and starts_with("other_courses_step1_1___1) because these questions were not on the other school surveys

took_step2_UVA <-
  UVA_in %>% select(
    starts_with("uworld_percent_step2"),
    starts_with("amboss_percent_step2"),
    starts_with("length_step2"),
    starts_with("practicetest_step2"),
    starts_with("full_test_practice_step2"),
    starts_with("practice_score_step2"),
    starts_with("score_step2"),
    starts_with("target_score_step2"),
    "schoolid"
  )

took_step2_H <-
  H_in %>% select(
    starts_with("uworld_percent_step2"),
    starts_with("amboss_percent_step2"),
    starts_with("length_step2"),
    starts_with("practicetest_step2"),
    starts_with("full_test_practice_step2"),
    starts_with("practice_score_step2"),
    starts_with("score_step2"),
    starts_with("target_score_step2"),
    "schoolid"
  )

took_step2general <- rbind(took_step2_H, took_step2_UVA, took_step2_VU)
took_step2_first <- took_step2general %>% select(ends_with("_1"), "schoolid")
took_step2_second <- took_step2general %>% select(ends_with("_2"), "schoolid")
visdat::vis_miss(took_step2_first) #looks like a school specific trend with who is answering what question...

visdat::vis_miss(took_step2_second) #looks like a school specific trend with who is answering what question...

#this function is so I can customize the panels in the correlation pairs plots. 
tf = function(x,y) {
  ct <- cor.test(x, y)
  sprintf("Corr: %0.2f, p: %0.3f\n95%% CI: (%0.2f - %0.2f) \n n pairs = %.0f", 
          ct$estimate, ct$p.value, ct$conf.int[1], ct$conf.int[2], sum(!is.na(x), !is.na(y))/2)
}  #https://stackoverflow.com/questions/75068871/add-p-value-and-or-95-ci-in-the-text-of-a-correlation-matrix-pairs-plot

#I made a pairs plot but clearly some variable classes need to be changed
class(took_step2_second$practice_score_step2_2) <- "integer"
class(took_step2_second$score_step2_2) <- "integer"
ggpairs(took_step2_second, title="Pairs plots", upper=list(continuous=wrap("statistic",text_fn=tf,title=NULL, sep=NULL)), progress = FALSE) #well this means it's time to take a break

str(took_step2general)
'data.frame':   182 obs. of  17 variables:
 $ uworld_percent_step2_1    : int  90 NA NA NA NA NA NA NA NA NA ...
 $ uworld_percent_step2_2    : int  NA 100 74 60 100 95 NA NA 100 90 ...
 $ amboss_percent_step2_1    : int  NA NA NA NA NA NA NA NA NA NA ...
 $ amboss_percent_step2_2    : int  NA NA 15 NA 5 NA NA NA NA NA ...
 $ length_step2_1            : int  5 NA NA NA NA NA NA NA NA NA ...
 $ length_step2_2            : int  NA 4 3 4 5 4 NA NA 4 4 ...
 $ practicetest_step2_1      : chr  "5" NA NA NA ...
 $ practicetest_step2_2      : chr  NA "4" "4" "7" ...
 $ full_test_practice_step2_1: int  1 NA NA NA NA NA NA NA NA NA ...
 $ full_test_practice_step2_2: int  NA 4 1 4 4 4 NA NA 3 4 ...
 $ practice_score_step2_1    : int  266 NA NA NA NA NA NA NA NA NA ...
 $ practice_score_step2_2    : chr  NA "251" "260" NA ...
 $ score_step2_1             : chr  "264" NA NA NA ...
 $ score_step2_2             : chr  NA "262" "264" "269" ...
 $ target_score_step2_1      : int  1 NA NA NA NA NA NA NA NA NA ...
 $ target_score_step2_2      : int  NA 2 2 2 1 2 NA NA 1 2 ...
 $ schoolid                  : chr  "HMS" "HMS" "HMS" "HMS" ...

0.1 what about profiling the data we have for if they pushed back an exam?

It doesn’t look like there is a question related to pushing back step 2, just step 1. Is that okay?

describe(took_step1general$push_step1_1)
took_step1general$push_step1_1 
       n  missing distinct     Info     Mean      Gmd 
      88       94        2    0.302    1.886   0.2038 
                      
Value          1     2
Frequency     10    78
Proportion 0.114 0.886
describe(took_step1general$push_step1_2)
took_step1general$push_step1_2 
       n  missing distinct     Info     Mean      Gmd 
      66      116        2    0.386    1.848   0.2611 
                      
Value          1     2
Frequency     10    56
Proportion 0.152 0.848

1 data cleaning note (discuss as a group if this is a useful data cleaning path anyways)

this was another tactic I was trying but frustratingly, the class of different columns was changed between surveys. So far, I have abandoned this technique in favor of the above, where I am affirmativly selecting the columns (but I still may run into the different class issue)

#try dropping problematic columns and then do bind_rows
# removeextracolsH <-
#   H_in %>% select(
#     -c(
#       "vusmhms_usmle_trends_survey_timestamp",
#       "vusmhms_usmle_trends_survey_complete"
#     ),
#     contains("number_other_courses")
#   )
# setdiff(names(H_in), names(UVA_in))
# setdiff(names(removeextracolsH), names(UVA_in))
# 
# removeextracolsUVA <-
#   UVA_in %>% select(
#     -contains("number_other_courses"),
#     -c(
#       "vusmuva_usmle_trends_survey_timestamp",
#       "vusmuva_usmle_trends_survey_complete"
#     )
#   )
# setdiff(names(removeextracolsUVA), names(removeextracolsH)) #confirming the cols match now
# 
# removeextracolsVU <- VU_in %>% select(-contains("number_other_courses"), -contains("other_courses"))
# setdiff(names(removeextracolsUVA), names(removeextracolsVU)) #confirming the cols match now
# 
# class(removeextracolsH$practicetest_step1_1) <- "integer" #this column was coded as a character so it wouldn't merge previously
# bind_rows(list(removeextracolsVU, removeextracolsH), .id = "id")

1.0.1 remember to profile missingness for students who conducted only one Step Survey